/*
 * Copyright (C) 2015 Southern Storm Software, Pty Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "Speck.h"
#include "Crypto.h"
#include "utility/RotateUtil.h"
#include "utility/EndianUtil.h"
#include <string.h>

/**
 * \class Speck Speck.h <Speck.h>
 * \brief Speck block cipher with a 128-bit block size.
 *
 * Speck is a family of lightweight block ciphers designed by the
 * National Security Agency (NSA).  The ciphers are highly optimized
 * for software implementation on microcontrollers.
 *
 * This class implements the Speck family that uses 128-bit block sizes
 * with 128-bit, 192-bit, or 256-bit key sizes.  Other Speck families support
 * smaller block sizes of 32, 48, 64, or 96 bits but such block sizes are
 * too small for use in modern cryptosystems.
 *
 * \note Current cryptanalysis (up until 2015) has not revealed any obvious
 * weaknesses in the full-round version of Speck.  But if you are wary of
 * ciphers designed by the NSA, then use ChaCha or AES instead.
 *
 * The SpeckTiny and SpeckSmall classes provide alternative implementations
 * that have reduced RAM and flash size requirements at the cost of some
 * features and performance.
 *
 * References: https://en.wikipedia.org/wiki/Speck_%28cipher%29,
 * http://eprint.iacr.org/2013/404
 *
 * \sa SpeckTiny, SpeckSmall
 */

// The "avr-gcc" compiler doesn't do a very good job of compiling
// code involving 64-bit values.  So we have to use inline assembly.
// It also helps to break the state up into 32-bit quantities
// because "asm" supports register names like %A0, %B0, %C0, %D0
// for the bytes in a 32-bit quantity, but it does not support
// %E0, %F0, %G0, %H0 for the high bytes of a 64-bit quantity.
#ifdef __AVR__
#define USE_AVR_INLINE_ASM 1
#endif

/**
 * \brief Constructs a Speck block cipher with no initial key.
 *
 * Only the round count is initialised here (to 32); no key schedule
 * exists yet.  Call setKey() before using the object for encryption
 * or decryption.
 */
Speck::Speck() : rounds(32) {}

/**
 * \brief Destroys this cipher object, passing the key schedule to clean().
 */
Speck::~Speck()
{
    // k holds the expanded round keys; clean() is declared elsewhere
    // (presumably Crypto.h) and presumably wipes the buffer.
    clean(k);
}

/**
 * \brief Size of a Speck block in bytes.
 *
 * \return Always 16 (a 128-bit block).
 */
size_t Speck::blockSize() const
{
    // 128-bit block, expressed in bytes.
    return 128 / 8;
}

/**
 * \brief Default key size in bytes.
 *
 * \return Always 32 (a 256-bit key).  setKey() additionally accepts
 * 16-byte and 24-byte keys, but only the largest size is reported here.
 */
size_t Speck::keySize() const
{
    // 256-bit key, expressed in bytes.
    return 256 / 8;
}

// Store (pack64) or load (unpack64) a 64-bit value to or from a byte
// buffer in big-endian order.  The transfer goes through memcpy(), so
// the buffer only needs byte alignment.
#define pack64(data, value) \
    do { \
        const uint64_t be_ = htobe64((value)); \
        memcpy((data), &be_, sizeof(be_)); \
    } while (0)
#define unpack64(value, data) \
    do { \
        uint64_t raw_; \
        memcpy(&raw_, (data), sizeof(raw_)); \
        (value) = be64toh(raw_); \
    } while (0)

/**
 * \brief Sets the key and expands it into the round key schedule in k.
 *
 * \param key Points to the key bytes, which are interpreted as big-endian
 * 64-bit words.
 * \param len Length of the key in bytes: 16, 24, or 32.
 *
 * \return Returns true if the key schedule was generated, or false if
 * \a len is not one of the supported lengths.  On failure neither the
 * round count nor the key schedule is touched.
 *
 * The round count is set to 30 plus the number of 64-bit words in the
 * key (32, 33, or 34 rounds).  The last 8 bytes of the key become the
 * first round key k[0]; the remaining words seed the temporary array l,
 * which is cleared before returning successfully.
 */
bool Speck::setKey(const uint8_t *key, size_t len)
{
#if USE_AVR_INLINE_ASM
    // Automatically generated by the genspeck tool.
    uint64_t l[4];
    // m is the number of 64-bit words in the key; mb is the number of key
    // bytes that are copied into l (everything except the last 8 bytes).
    uint8_t m, mb;
    if (len == 32) {
        m = 4;
        mb = 3 * 8;
    } else if (len == 24) {
        m = 3;
        mb = 2 * 8;
    } else if (len == 16) {
        m = 2;
        mb = 8;
    } else {
        return false;
    }
    rounds = 30 + m;
    // Number of schedule iterations: k[0] comes straight from the key.
    uint8_t r = rounds - 1;
    // NOTE(review): the assembly below overwrites registers that are bound
    // to input operands (X, Z, and both halves of %2).  GCC's extended asm
    // rules say input operands must not be modified - confirm that this is
    // safe with the toolchains in use.
    __asm__ __volatile__ (
        // Copy the last 8 bytes of the key into k[0].  X starts at
        // key + len and is pre-decremented, so the byte order is reversed.
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        // Rewind Z to the start of k, save it in r10:r11, and point Z at l.
        "sbiw r30,8\n"
        "movw r10,r30\n"
        "movw r30,%A2\n"
        // Copy the remaining mb key bytes into l, again in reversed order.
        "ldd r8,%3\n"
        "1:\n"
        "ld __tmp_reg__,-X\n"
        "st Z+,__tmp_reg__\n"
        "dec r8\n"
        "brne 1b\n"
        // X = l and Z = k.  The two halves of %2 are reused as byte offsets
        // into l: %A2 is the word to read (starts at 0) and %B2 is the word
        // to write (starts at mb).  r25 is the round counter.
        "movw r26,%A2\n"
        "movw r30,r10\n"
        "clr %A2\n"
        "ldd %B2,%3\n"
        "clr r25\n"
        // Load k[0] into r16..r23 (r16 holds the low byte); Z now points
        // at k[1].
        "ld r16,Z+\n"
        "ld r17,Z+\n"
        "ld r18,Z+\n"
        "ld r19,Z+\n"
        "ld r20,Z+\n"
        "ld r21,Z+\n"
        "ld r22,Z+\n"
        "ld r23,Z+\n"
        "2:\n"
        // Load the word at l + %A2 into r8..r15, rotated right by 8 bits:
        // its lowest byte lands in r15 and the others move down one place.
        "add r26,%A2\n"
        "adc r27,__zero_reg__\n"
        "ld r15,X+\n"
        "ld r8,X+\n"
        "ld r9,X+\n"
        "ld r10,X+\n"
        "ld r11,X+\n"
        "ld r12,X+\n"
        "ld r13,X+\n"
        "ld r14,X+\n"
        // Restore X to the start of l.
        "sub r26,%A2\n"
        "sbc r27,__zero_reg__\n"
        "sbiw r26,8\n"
        // Add the current round key and XOR the round counter into the
        // low byte.
        "add r8,r16\n"
        "adc r9,r17\n"
        "adc r10,r18\n"
        "adc r11,r19\n"
        "adc r12,r20\n"
        "adc r13,r21\n"
        "adc r14,r22\n"
        "adc r15,r23\n"
        "eor r8,r25\n"
        // Store the result at l + %B2, then restore X to the start of l.
        "add r26,%B2\n"
        "adc r27,__zero_reg__\n"
        "st X+,r8\n"
        "st X+,r9\n"
        "st X+,r10\n"
        "st X+,r11\n"
        "st X+,r12\n"
        "st X+,r13\n"
        "st X+,r14\n"
        "st X+,r15\n"
        "sub r26,%B2\n"
        "sbc r27,__zero_reg__\n"
        "sbiw r26,8\n"
        // Rotate the current round key left by 3 bits, one bit per pass;
        // each adc folds the bit shifted out of r23 back into bit 0 of r16.
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        // XOR in the word just written to l to form the next round key,
        // and append it to k.
        "eor r16,r8\n"
        "eor r17,r9\n"
        "eor r18,r10\n"
        "eor r19,r11\n"
        "eor r20,r12\n"
        "eor r21,r13\n"
        "eor r22,r14\n"
        "eor r23,r15\n"
        "st Z+,r16\n"
        "st Z+,r17\n"
        "st Z+,r18\n"
        "st Z+,r19\n"
        "st Z+,r20\n"
        "st Z+,r21\n"
        "st Z+,r22\n"
        "st Z+,r23\n"
        // Advance both offsets by one word, wrapping at 32 bytes (the
        // size of l).
        "ldi r24,8\n"
        "add %A2,r24\n"
        "add %B2,r24\n"
        "ldi r24,0x1F\n"
        "and %A2,r24\n"
        "and %B2,r24\n"
        // Repeat until the round counter reaches r.
        "ldd r8,%4\n"
        "inc r25\n"
        "cp r25,r8\n"
        "breq 3f\n"
        "rjmp 2b\n"
        "3:\n"
        // X points at the start of l again: zero all 32 bytes of it.
        "ldi r24,32\n"
        "4:\n"
        "st X+,__zero_reg__\n"
        "dec r24\n"
        "brne 4b\n"
        // Operands: %0 = k (in Z), %1 = key + len (in X), %2 = l,
        // %3 = mb, %4 = r.
        : : "z"(k), "x"(key + len), "r"(l), "Q"(mb), "Q"(r)
        : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
        , "r24", "r25"
    );
    return true;
#else
    uint64_t l[4];
    uint8_t m;
    // Split the key into big-endian 64-bit words.  The last word becomes
    // k[0]; the preceding words are placed in l in reverse order.
    if (len == 32) {
        m = 4;
        unpack64(l[2], key);
        unpack64(l[1], key + 8);
        unpack64(l[0], key + 16);
        unpack64(k[0], key + 24);
    } else if (len == 24) {
        m = 3;
        unpack64(l[1], key);
        unpack64(l[0], key + 8);
        unpack64(k[0], key + 16);
    } else if (len == 16) {
        m = 2;
        unpack64(l[0], key);
        unpack64(k[0], key + 8);
    } else {
        return false;
    }
    rounds = 30 + m;
    // l is used as a circular buffer of m words: li_in is the slot that is
    // consumed in the current iteration and li_out is the slot produced.
    uint8_t li_in = 0;
    uint8_t li_out = m - 1;
    for (uint8_t i = 0; i < (rounds - 1); ++i) {
        l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i;
        k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
        if ((++li_in) >= m)
            li_in = 0;
        if ((++li_out) >= m)
            li_out = 0;
    }
    // The temporary words are derived from the key; clean() presumably
    // wipes them.
    clean(l);
    return true;
#endif
}

/**
 * \brief Encrypts a single 16-byte block using the current key schedule.
 *
 * \param output Buffer that receives the 16 bytes of output.
 * \param input Points to the 16 bytes of input.
 *
 * The block is treated as two big-endian 64-bit words.  All 16 input
 * bytes are read before any output byte is written.  The round keys in
 * k are consumed in ascending order, one per round.
 */
void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
    // Automatically generated by the genspeck tool.
    // NOTE(review): the assembly decrements %2 and advances X and Z, all
    // of which are declared as input operands.  GCC's extended asm rules
    // say input operands must not be modified - confirm that this is safe
    // with the toolchains in use.
    __asm__ __volatile__ (
        // Load the block from big-endian bytes: x into r8..r15 and y into
        // r16..r23, with r8 and r16 holding the low bytes.
        "ld r15,X+\n"
        "ld r14,X+\n"
        "ld r13,X+\n"
        "ld r12,X+\n"
        "ld r11,X+\n"
        "ld r10,X+\n"
        "ld r9,X+\n"
        "ld r8,X+\n"
        "ld r23,X+\n"
        "ld r22,X+\n"
        "ld r21,X+\n"
        "ld r20,X+\n"
        "ld r19,X+\n"
        "ld r18,X+\n"
        "ld r17,X+\n"
        "ld r16,X\n"
        "1:\n"
        // x = (x rotated right by 8) + y.  The rotation is implicit: r9 is
        // treated as the low byte and r8 as the high byte.
        "add r9,r16\n"
        "adc r10,r17\n"
        "adc r11,r18\n"
        "adc r12,r19\n"
        "adc r13,r20\n"
        "adc r14,r21\n"
        "adc r15,r22\n"
        "adc r8,r23\n"
        // XOR with the next round key read through Z, shifting each result
        // byte down one register so that x ends up in r8..r15 again.
        "ld __tmp_reg__,Z+\n"
        "eor __tmp_reg__,r9\n"
        "ld r9,Z+\n"
        "eor r9,r10\n"
        "ld r10,Z+\n"
        "eor r10,r11\n"
        "ld r11,Z+\n"
        "eor r11,r12\n"
        "ld r12,Z+\n"
        "eor r12,r13\n"
        "ld r13,Z+\n"
        "eor r13,r14\n"
        "ld r14,Z+\n"
        "eor r14,r15\n"
        "ld r15,Z+\n"
        "eor r15,r8\n"
        "mov r8,__tmp_reg__\n"
        // Rotate y left by 3 bits, one bit per pass; each adc folds the
        // bit shifted out of r23 back into bit 0 of r16.
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "rol r21\n"
        "rol r22\n"
        "rol r23\n"
        "adc r16, __zero_reg__\n"
        // y ^= x.
        "eor r16,r8\n"
        "eor r17,r9\n"
        "eor r18,r10\n"
        "eor r19,r11\n"
        "eor r20,r12\n"
        "eor r21,r13\n"
        "eor r22,r14\n"
        "eor r23,r15\n"
        // Count down the remaining rounds.
        "dec %2\n"
        "breq 2f\n"
        "rjmp 1b\n"
        "2:\n"
        // Load the output pointer into X and store x followed by y as
        // big-endian bytes.
        "ldd r26,%A3\n"
        "ldd r27,%B3\n"
        "st X+,r15\n"
        "st X+,r14\n"
        "st X+,r13\n"
        "st X+,r12\n"
        "st X+,r11\n"
        "st X+,r10\n"
        "st X+,r9\n"
        "st X+,r8\n"
        "st X+,r23\n"
        "st X+,r22\n"
        "st X+,r21\n"
        "st X+,r20\n"
        "st X+,r19\n"
        "st X+,r18\n"
        "st X+,r17\n"
        "st X,r16\n"
        // Operands: %0 = input (in X), %1 = k (in Z), %2 = rounds,
        // %3 = output.
        : : "x"(input), "z"(k), "r"(rounds), "Q"(output)
        : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
    );
#else
    uint64_t x, y;
    // s walks forward through the key schedule, one round key per round.
    const uint64_t *s = k;
    unpack64(x, input);
    unpack64(y, input + 8);
    for (uint8_t round = rounds; round > 0; --round, ++s) {
        x = (rightRotate8_64(x) + y) ^ s[0];
        y = leftRotate3_64(y) ^ x;
    }
    pack64(output, x);
    pack64(output + 8, y);
#endif
}

/**
 * \brief Decrypts a single 16-byte block using the current key schedule.
 *
 * \param output Buffer that receives the 16 bytes of output.
 * \param input Points to the 16 bytes of input.
 *
 * The block is treated as two big-endian 64-bit words.  All 16 input
 * bytes are read before any output byte is written.  The round keys in
 * k are consumed in descending order, starting with the last one, and
 * each round applies the inverse of the encryptBlock() round.
 */
void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
    // Automatically generated by the genspeck tool.
    // NOTE(review): the assembly decrements %2 and moves X and Z, all of
    // which are declared as input operands.  GCC's extended asm rules say
    // input operands must not be modified - confirm that this is safe with
    // the toolchains in use.
    __asm__ __volatile__ (
        // Load the block from big-endian bytes: x into r8..r15 and y into
        // r16..r23, with r8 and r16 holding the low bytes.
        "ld r15,X+\n"
        "ld r14,X+\n"
        "ld r13,X+\n"
        "ld r12,X+\n"
        "ld r11,X+\n"
        "ld r10,X+\n"
        "ld r9,X+\n"
        "ld r8,X+\n"
        "ld r23,X+\n"
        "ld r22,X+\n"
        "ld r21,X+\n"
        "ld r20,X+\n"
        "ld r19,X+\n"
        "ld r18,X+\n"
        "ld r17,X+\n"
        "ld r16,X\n"
        "1:\n"
        // y ^= x.
        "eor r16,r8\n"
        "eor r17,r9\n"
        "eor r18,r10\n"
        "eor r19,r11\n"
        "eor r20,r12\n"
        "eor r21,r13\n"
        "eor r22,r14\n"
        "eor r23,r15\n"
        // Rotate y right by 3 bits, one bit per pass.  bst saves bit 0 of
        // r16 in the T flag and bld writes it into bit 7 of r23 after the
        // shift, replacing whatever the carry flag brought in.
        "bst r16,0\n"
        "ror r23\n"
        "ror r22\n"
        "ror r21\n"
        "ror r20\n"
        "ror r19\n"
        "ror r18\n"
        "ror r17\n"
        "ror r16\n"
        "bld r23,7\n"
        "bst r16,0\n"
        "ror r23\n"
        "ror r22\n"
        "ror r21\n"
        "ror r20\n"
        "ror r19\n"
        "ror r18\n"
        "ror r17\n"
        "ror r16\n"
        "bld r23,7\n"
        "bst r16,0\n"
        "ror r23\n"
        "ror r22\n"
        "ror r21\n"
        "ror r20\n"
        "ror r19\n"
        "ror r18\n"
        "ror r17\n"
        "ror r16\n"
        "bld r23,7\n"
        // XOR x with the previous round key.  Z is pre-decremented, so the
        // key bytes arrive from high to low; each result byte is moved up
        // one register (the high byte wraps to r8), which prepares the
        // left rotation by 8.
        "ld __tmp_reg__,-Z\n"
        "eor __tmp_reg__,r15\n"
        "ld r15,-Z\n"
        "eor r15,r14\n"
        "ld r14,-Z\n"
        "eor r14,r13\n"
        "ld r13,-Z\n"
        "eor r13,r12\n"
        "ld r12,-Z\n"
        "eor r12,r11\n"
        "ld r11,-Z\n"
        "eor r11,r10\n"
        "ld r10,-Z\n"
        "eor r10,r9\n"
        "ld r9,-Z\n"
        "eor r9,r8\n"
        "mov r8,__tmp_reg__\n"
        // Subtract y, treating r9 as the low byte and r8 as the high byte.
        // Reading the result back as r8..r15 is the rotation left by 8.
        "sub r9,r16\n"
        "sbc r10,r17\n"
        "sbc r11,r18\n"
        "sbc r12,r19\n"
        "sbc r13,r20\n"
        "sbc r14,r21\n"
        "sbc r15,r22\n"
        "sbc r8,r23\n"
        // Count down the remaining rounds.
        "dec %2\n"
        "breq 2f\n"
        "rjmp 1b\n"
        "2:\n"
        // Load the output pointer into X and store x followed by y as
        // big-endian bytes.
        "ldd r26,%A3\n"
        "ldd r27,%B3\n"
        "st X+,r15\n"
        "st X+,r14\n"
        "st X+,r13\n"
        "st X+,r12\n"
        "st X+,r11\n"
        "st X+,r10\n"
        "st X+,r9\n"
        "st X+,r8\n"
        "st X+,r23\n"
        "st X+,r22\n"
        "st X+,r21\n"
        "st X+,r20\n"
        "st X+,r19\n"
        "st X+,r18\n"
        "st X+,r17\n"
        "st X,r16\n"
        // Operands: %0 = input (in X), %1 = k + rounds (in Z, one element
        // past the last round key), %2 = rounds, %3 = output.
        : : "x"(input), "z"(k + rounds), "r"(rounds), "Q"(output)
        : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
    );
#else
    uint64_t x, y;
    // s starts at the last round key and walks backwards.
    const uint64_t *s = k + rounds - 1;
    unpack64(x, input);
    unpack64(y, input + 8);
    for (uint8_t round = rounds; round > 0; --round, --s) {
        y = rightRotate3_64(x ^ y);
        x = leftRotate8_64((x ^ s[0]) - y);
    }
    pack64(output, x);
    pack64(output + 8, y);
#endif
}

/**
 * \brief Passes the key schedule to clean().
 *
 * The round count is left unchanged.  Because the round keys in k are
 * handed to clean() (presumably wiping them), setKey() should be called
 * again before the object is used for further encryption or decryption.
 */
void Speck::clear()
{
    // clean() is declared elsewhere (presumably Crypto.h).
    clean(k);
}
